library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gapminder)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Question 1 ----
# Load the restaurant inspection CSV.
# NOTE(review): this absolute path only resolves on the original author's
# machine; confirm the file location before re-running.
mydata <- read.csv(
  file = "/Users/rohin/Desktop/NYRestaurantInspection2022.csv"
)
# Preview only: the tibble is printed, `mydata` itself stays a data.frame.
mydata %>%
  as_tibble()
## # A tibble: 240,610 × 27
## CAMIS DBA BORO BUILDING STREET ZIPCODE PHONE CUISINE.DESCRIPTION
## <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 50117016 "NO CHEWING… Manh… 1802 65TH … "" 9178… ""
## 2 50116677 "MAMAN" Broo… 154 COURT… "11201" 7184… ""
## 3 50126777 "TANG MUSIC… Quee… 5530 58TH … "11378" 3478… ""
## 4 50111558 "LEGENDS HO… Manh… 1 INTRE… "" 9172… ""
## 5 50116856 "LA PECORA … Manh… 817 WASHI… "10014" 2128… ""
## 6 50108825 "AMPLE HILL… Broo… 1 WATER… "11201" 3478… ""
## 7 50075103 "PQR" Manh… 1631 2 AVE… "10028" 9174… "Pizza"
## 8 50127765 "" Manh… 695 PARK … "10065" 3478… ""
## 9 50124776 "" Manh… 12 PARK … "10016" 9292… ""
## 10 50110734 "Chelsea Ta… Manh… 152 WEST … "10001" 9175… ""
## # ℹ 240,600 more rows
## # ℹ 19 more variables: INSPECTION.DATE <chr>, ACTION <chr>,
## # VIOLATION.CODE <chr>, VIOLATION.DESCRIPTION <chr>, CRITICAL.FLAG <chr>,
## # SCORE <int>, GRADE <chr>, GRADE.DATE <chr>, RECORD.DATE <chr>,
## # INSPECTION.TYPE <chr>, Latitude <dbl>, Longitude <dbl>,
## # Community.Board <int>, Council.District <int>, Census.Tract <int>,
## # BIN <int>, BBL <dbl>, NTA <chr>, Location.Point <lgl>
# Keep only the Queens rows whose cuisine is recorded as "Pizza".
mydata1 <- mydata %>%
  filter(BORO == "Queens" & CUISINE.DESCRIPTION == "Pizza")
# Five most frequent DBA names in that subset. Every row is counted, and
# names are matched exactly, so spelling variants are tallied separately.
mydata1 %>%
  count(DBA, name = "count", sort = TRUE) %>%
  slice_head(n = 5)
## # A tibble: 5 × 2
## DBA count
## <chr> <int>
## 1 DOMINO'S 130
## 2 PAPA JOHN'S PIZZA 69
## 3 PAPA JOHN'S 68
## 4 DOMINOS 48
## 5 ROSA'S PIZZA 48
# Inspection date of every row recorded for Susano's, printed in full.
# The result stays grouped by date, so the header reports how many distinct
# dates there are while repeated dates remain listed.
mydata1 %>%
  filter(DBA == "SUSANO'S PIZZERIA & RESTAURANT") %>%
  select(INSPECTION.DATE) %>%
  group_by(INSPECTION.DATE) %>%
  print(n = Inf)
## # A tibble: 17 × 1
## # Groups: INSPECTION.DATE [5]
## INSPECTION.DATE
## <chr>
## 1 07/31/2019
## 2 07/31/2019
## 3 07/31/2019
## 4 05/05/2022
## 5 12/09/2019
## 6 07/31/2019
## 7 08/14/2019
## 8 07/31/2019
## 9 07/31/2019
## 10 01/08/2020
## 11 05/05/2022
## 12 05/05/2022
## 13 12/09/2019
## 14 12/09/2019
## 15 12/09/2019
## 16 01/08/2020
## 17 05/05/2022
# Question 2 ----
# Read the tab-delimited gapminder/Gini file.
# NOTE(review): absolute path is specific to the original author's machine.
mydata2 <- read.delim(file = "/Users/rohin/Desktop/gapminder_2007_gini.tsv")

# Box plot of the Gini index per continent; outliers are drawn as large red
# filled circles.
gini_box <- ggplot(data = mydata2) +
  geom_boxplot(
    mapping = aes(x = continent, y = gini, color = continent),
    outlier.colour = "red",
    outlier.shape = 16,
    outlier.size = 3,
    notch = FALSE
  ) +
  ggtitle("Gini Index in all continents")
# Show the static plot, then hand the same object to plotly explicitly.
gini_box
ggplotly(gini_box)
# Life expectancy against Gini index, one panel per continent, with point
# size mapped to population.
# `label = country` is not drawn by geom_point(); presumably it is there so
# the country shows up in the plotly tooltip (confirm).
gini_life <- ggplot(
  data = mydata2,
  mapping = aes(
    x = gini, y = lifeExp,
    color = continent, size = pop, label = country
  )
) +
  geom_point() +
  ggtitle("life expectancy V gini index") +
  facet_wrap(~continent)
# Show the static plot, then hand the same object to plotly explicitly.
gini_life
ggplotly(gini_life)
# Per-continent minimum, maximum and mean of the Gini index, ignoring NAs.
mydata2 %>%
  group_by(continent) %>%
  summarise(
    minimum = min(gini, na.rm = TRUE),
    maximum = max(gini, na.rm = TRUE),
    mean = mean(gini, na.rm = TRUE)
  )
## # A tibble: 5 × 4
## continent minimum maximum mean
## <chr> <dbl> <dbl> <dbl>
## 1 Africa 30.8 63.2 43.9
## 2 Americas 32.1 60.8 48.2
## 3 Asia 29.6 49 40.2
## 4 Europe 23.7 40.2 30.5
## 5 Oceania 30.3 36.2 33.2
# (2b) There is a clear negative correlation between life expectancy and the Gini index: in the plot, countries with a lower Gini index tend to have higher life expectancy, and countries with a higher Gini index tend to have lower life expectancy.
# Question 3 ----
# Total GDP for each country-year: population times GDP per capita.
gdp1 <- gapminder %>%
  mutate(gdp = pop * gdpPercap)
gdp1 %>%
  head()
## # A tibble: 6 × 7
## country continent year lifeExp pop gdpPercap gdp
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779. 6567086330.
## 2 Afghanistan Asia 1957 30.3 9240934 821. 7585448670.
## 3 Afghanistan Asia 1962 32.0 10267083 853. 8758855797.
## 4 Afghanistan Asia 1967 34.0 11537966 836. 9648014150.
## 5 Afghanistan Asia 1972 36.1 13079460 740. 9678553274.
## 6 Afghanistan Asia 1977 38.4 14880372 786. 11697659231.
# Baseline: the United States row for 2007.
usa_gdp <- gdp1 %>%
  filter(country == "United States" & year == 2007)
# Express every country-year GDP as a fraction of that baseline.
# NOTE(review): the 2007 US value is used for all years, not the US value of
# the same year; confirm this is the intended denominator.
gdp2 <- gdp1 %>%
  mutate(gdp_ratio = gdp / usa_gdp[["gdp"]])
head(gdp2)
## # A tibble: 6 × 8
## country continent year lifeExp pop gdpPercap gdp gdp_ratio
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779. 6567086330. 0.000508
## 2 Afghanistan Asia 1957 30.3 9240934 821. 7585448670. 0.000586
## 3 Afghanistan Asia 1962 32.0 10267083 853. 8758855797. 0.000677
## 4 Afghanistan Asia 1967 34.0 11537966 836. 9648014150. 0.000746
## 5 Afghanistan Asia 1972 36.1 13079460 740. 9678553274. 0.000748
## 6 Afghanistan Asia 1977 38.4 14880372 786. 11697659231. 0.000904
# Median GDP ratio for every continent-year pair.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs applied to
# `gdp3` do not silently run per continent, and summarise() no longer emits
# its "grouped output" message.
gdp3 <- gdp2 %>%
  group_by(continent, year) %>%
  summarize(median = median(gdp_ratio), .groups = "drop")
# Continent medians over time, drawn as points joined by lines.
median_plot <- ggplot(
  data = gdp3,
  mapping = aes(x = year, y = median, color = continent)
) +
  geom_point() +
  geom_line() +
  ggtitle("Median V GDP Ratio")
# Show the static plot, then hand the same object to plotly explicitly.
median_plot
ggplotly(median_plot)